##lOADING ALL THE PACKAGES

rm(list=ls(all=TRUE))

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(MASS)
library(vegan)
## Loading required package: permute
## This is vegan 2.5-7
## 
## Attaching package: 'vegan'
## The following object is masked from 'package:caret':
## 
##     tolerance
library(data.table)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(DMwR)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(e1071)
library(standardize)
## 
##  *********************************************************** 
##           Loading standardize package version 0.2.2          
##      Call standardize.news() to see new features/changes     
##  ***********************************************************
library(ggplot2)
library(arules)
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

Reading the various datasets

setwd("~/Linear and Non Linear/Data")
one=read.csv("Train.csv",na.strings=c("","NA"))
two=read.csv("Train_Claim.csv",na.strings=c("","NA"))
three=read.csv("Train_Demographics.csv",na.strings=c("","NA"))
four=read.csv("Train_Policy.csv",na.strings=c("","NA"))
five=read.csv("Train_Vehicle.csv")
#Converting all the missing values to NA

#for Data of VEhicle
five$VehicleAttributeDetails=as.factor(ifelse(five$VehicleAttributeDetails=="???",NA,five$VehicleAttributeDetails))

#for Claim INformation
two$TypeOfCollission=as.factor(ifelse(two$TypeOfCollission=="?",NA,two$TypeOfCollission))
two$IncidentTime=as.factor(ifelse(two$IncidentTime=="-5",NA,two$IncidentTime))
two$PropertyDamage=as.factor(ifelse(two$PropertyDamage=="?",NA,two$PropertyDamage))
two$Witnesses=as.factor(ifelse(two$Witnesses=="MISSINGVALUE",NA,two$Witnesses))
two$PoliceReport=as.factor(ifelse(two$PoliceReport=="?",NA,two$PoliceReport))
two$AmountOfTotalClaim=(ifelse(two$AmountOfTotalClaim=="MISSEDDATA",NA,two$AmountOfTotalClaim))

#for policy  INFORMATIOn
four$PolicyAnnualPremium=(ifelse(four$PolicyAnnualPremium=="-1",NA,four$PolicyAnnualPremium))
colSums(is.na(five))
##              CustomerID        VehicleAttribute VehicleAttributeDetails 
##                       0                       0                      50
colSums(is.na(two))
##            CustomerID        DateOfIncident        TypeOfIncident 
##                     0                     0                     0 
##      TypeOfCollission    SeverityOfIncident  AuthoritiesContacted 
##                  5162                     0                     0 
##         IncidentState          IncidentCity       IncidentAddress 
##                     0                     0                     0 
##          IncidentTime      NumberOfVehicles        PropertyDamage 
##                    31                     0                 10459 
##        BodilyInjuries             Witnesses          PoliceReport 
##                     0                    46                  9805 
##    AmountOfTotalClaim   AmountOfInjuryClaim AmountOfPropertyClaim 
##                    50                     0                     0 
## AmountOfVehicleDamage 
##                     0
colSums(is.na(four))
##      InsurancePolicyNumber      CustomerLoyaltyPeriod 
##                          0                          0 
##       DateOfPolicyCoverage       InsurancePolicyState 
##                          0                          0 
## Policy_CombinedSingleLimit          Policy_Deductible 
##                          0                          0 
##        PolicyAnnualPremium              UmbrellaLimit 
##                        141                          0 
##        InsuredRelationship                 CustomerID 
##                          0                          0
colSums(is.na(three))
##            CustomerID            InsuredAge        InsuredZipCode 
##                     0                     0                     0 
##         InsuredGender InsuredEducationLevel     InsuredOccupation 
##                    30                     0                     0 
##        InsuredHobbies          CapitalGains           CapitalLoss 
##                     0                     0                     0 
##               Country 
##                     2

Vehicle table has its attribute in a the vertical format, so connverting it into the horizontal format

five_one=five[five$VehicleAttribute %in% c("VehicleID"),]
five_one$VehicleAttributeDetails=as.factor(as.character(five_one$VehicleAttributeDetails))
five_one=five_one[,-2]
str(five_one)       # we can delete Five_one i.e VEHICLE ID as iT OBVIOUSLY HAS all different factors
## 'data.frame':    28836 obs. of  2 variables:
##  $ CustomerID             : chr  "Cust20179" "Cust13038" "Cust1801" "Cust14947" ...
##  $ VehicleAttributeDetails: Factor w/ 28836 levels "Vehicle10000",..: 28005 16377 11360 4017 15386 9527 16914 25689 23654 23148 ...
five_two=five[five$VehicleAttribute %in% c("VehicleMake"),] 
five_two$VehicleAttributeDetails2=as.factor(as.character(five_two$VehicleAttributeDetails))
str(five_two)
## 'data.frame':    28836 obs. of  4 variables:
##  $ CustomerID              : chr  "Cust33335" "Cust20624" "Cust9006" "Cust18447" ...
##  $ VehicleAttribute        : chr  "VehicleMake" "VehicleMake" "VehicleMake" "VehicleMake" ...
##  $ VehicleAttributeDetails : Factor w/ 28910 levels "1995","1996",..: 69 34 29 47 31 64 66 30 30 69 ...
##  $ VehicleAttributeDetails2: Factor w/ 14 levels "Accura","Audi",..: 13 4 1 7 3 11 12 2 2 13 ...
five_two=five_two[,c(-2,-3)]
five_three=five[five$VehicleAttribute %in% c("VehicleModel"),] 
five_three$VehicleAttributeDetails3=as.factor(as.character(five_three$VehicleAttributeDetails))
five_three=five_three[,c(-2,-3)]
five_four=five[five$VehicleAttribute %in% c("VehicleYOM"),]
five_four$VehicleAttributeDetails4=as.factor(as.character(five_four$VehicleAttributeDetails))
five_four=five_four[,c(-2,-3)]
str(five_four)
## 'data.frame':    28836 obs. of  2 variables:
##  $ CustomerID              : chr  "Cust21334" "Cust26634" "Cust21432" "Cust22845" ...
##  $ VehicleAttributeDetails4: Factor w/ 21 levels "1995","1996",..: 2 5 8 6 9 4 19 8 17 7 ...

#merging all the csv files

merge1 <- merge(five_three,five_two,by="CustomerID")
merge2 <- merge(merge1,five_four,by="CustomerID")
merge3 <- merge(merge2,four,by="CustomerID")
merge4 <- merge(merge3,three,by="CustomerID")
total <- merge(merge4,two,by="CustomerID")
total$InsurancePolicyNumber=as.factor(total$InsurancePolicyNumber)
#deleting customer ID and insurance policy number 
total=total[,setdiff(names(total),c("InsurancePolicyNumber","Country","AmountOfPropertyClaim_disc","AmountOfTotalClaim_disc","AmountOfInjuryClaim_disc"))]
total$InsuredZipCode=as.factor(as.integer(total$InsuredZipCode/1000))
str(total)
## 'data.frame':    28836 obs. of  38 variables:
##  $ CustomerID                : chr  "Cust10000" "Cust10001" "Cust10002" "Cust10003" ...
##  $ VehicleAttributeDetails3  : Factor w/ 39 levels "3 Series","92x",..: 6 6 21 21 12 12 8 8 29 2 ...
##  $ VehicleAttributeDetails2  : Factor w/ 14 levels "Accura","Audi",..: 2 2 14 14 13 13 9 12 14 11 ...
##  $ VehicleAttributeDetails4  : Factor w/ 21 levels "1995","1996",..: 14 12 5 9 16 17 6 16 1 10 ...
##  $ CustomerLoyaltyPeriod     : int  49 114 167 190 115 101 471 340 81 328 ...
##  $ DateOfPolicyCoverage      : chr  "1998-10-25" "2000-11-15" "2001-02-12" "2005-04-11" ...
##  $ InsurancePolicyState      : chr  "State1" "State1" "State3" "State2" ...
##  $ Policy_CombinedSingleLimit: chr  "100/300" "100/300" "500/1000" "500/1000" ...
##  $ Policy_Deductible         : int  1000 1000 617 722 500 500 512 877 2000 1000 ...
##  $ PolicyAnnualPremium       : num  1633 1255 1373 1338 1354 ...
##  $ UmbrellaLimit             : int  0 0 0 0 4279863 3921366 165819 5282219 0 0 ...
##  $ InsuredRelationship       : chr  "not-in-family" "not-in-family" "wife" "own-child" ...
##  $ InsuredAge                : int  35 36 33 36 29 28 57 49 27 48 ...
##  $ InsuredZipCode            : Factor w/ 71 levels "430","431","432",..: 25 25 54 45 28 28 47 47 3 37 ...
##  $ InsuredGender             : chr  "MALE" "MALE" "MALE" "MALE" ...
##  $ InsuredEducationLevel     : chr  "JD" "JD" "JD" "JD" ...
##  $ InsuredOccupation         : chr  "armed-forces" "tech-support" "armed-forces" "armed-forces" ...
##  $ InsuredHobbies            : chr  "movies" "cross-fit" "polo" "polo" ...
##  $ CapitalGains              : int  56700 70600 66400 47900 0 0 67400 67400 56400 53300 ...
##  $ CapitalLoss               : int  -48500 -48500 -63700 -73400 -41500 -41500 0 0 -32800 0 ...
##  $ DateOfIncident            : chr  "2015-02-03" "2015-02-02" "2015-01-15" "2015-01-19" ...
##  $ TypeOfIncident            : chr  "Multi-vehicle Collision" "Multi-vehicle Collision" "Single Vehicle Collision" "Single Vehicle Collision" ...
##  $ TypeOfCollission          : Factor w/ 3 levels "Front Collision",..: 3 3 3 3 2 2 1 1 1 3 ...
##  $ SeverityOfIncident        : chr  "Total Loss" "Total Loss" "Minor Damage" "Minor Damage" ...
##  $ AuthoritiesContacted      : chr  "Police" "Police" "Other" "Other" ...
##  $ IncidentState             : chr  "State7" "State7" "State8" "State9" ...
##  $ IncidentCity              : chr  "City1" "City5" "City6" "City6" ...
##  $ IncidentAddress           : chr  "Location 1311" "Location 1311" "Location 2081" "Location 2081" ...
##  $ IncidentTime              : Factor w/ 24 levels "0","1","2","3",..: 18 11 23 23 11 8 21 19 4 6 ...
##  $ NumberOfVehicles          : int  3 3 1 1 1 1 1 1 3 1 ...
##  $ PropertyDamage            : Factor w/ 2 levels "NO","YES": NA 2 2 2 1 1 NA NA 2 2 ...
##  $ BodilyInjuries            : int  1 2 2 2 2 1 0 0 0 1 ...
##  $ Witnesses                 : Factor w/ 4 levels "0","1","2","3": 1 2 4 4 2 3 3 3 1 3 ...
##  $ PoliceReport              : Factor w/ 2 levels "NO","YES": NA 2 1 1 2 NA 1 1 NA 2 ...
##  $ AmountOfTotalClaim        : chr  "65501" "61382" "66755" "66243" ...
##  $ AmountOfInjuryClaim       : int  13417 15560 11630 12003 8829 7818 6476 5738 6788 6510 ...
##  $ AmountOfPropertyClaim     : int  6071 5919 11630 12003 7234 8132 12822 7333 7504 13020 ...
##  $ AmountOfVehicleDamage     : int  46013 39903 43495 42237 37481 37217 58155 47498 53584 52080 ...
#trying to fill NA's of insured gender using insured relationship
total$InsuredGender=as.factor(ifelse(total$InsuredRelationship=="wife","2",total$InsuredGender))
total$InsuredGender=as.factor(ifelse(total$InsuredRelationship=="husband","1",total$InsuredGender))

#Central imputation on the enitre dataset
finaldata=centralImputation(total)

###feature engineering

#finding the number of days incident happened before accident
finaldata$DateOfIncident=as.Date(finaldata$DateOfIncident, format = "%Y-%m-%d")
finaldata$DateOfPolicyCoverage=as.Date(finaldata$DateOfPolicyCoverage, format = "%Y-%m-%d")
finaldata$incident_coverage=as.integer(finaldata$DateOfIncident-finaldata$DateOfPolicyCoverage)
finaldata$daysremainingforinsurance=finaldata$InsuredAge*365-finaldata$incident_coverage
final <- merge(one,finaldata,by="CustomerID")
final=final[,-1]
str(final)
## 'data.frame':    28836 obs. of  40 variables:
##  $ ReportedFraud             : chr  "N" "N" "N" "N" ...
##  $ VehicleAttributeDetails3  : Factor w/ 39 levels "3 Series","92x",..: 6 6 21 21 12 12 8 8 29 2 ...
##  $ VehicleAttributeDetails2  : Factor w/ 14 levels "Accura","Audi",..: 2 2 14 14 13 13 9 12 14 11 ...
##  $ VehicleAttributeDetails4  : Factor w/ 21 levels "1995","1996",..: 14 12 5 9 16 17 6 16 1 10 ...
##  $ CustomerLoyaltyPeriod     : int  49 114 167 190 115 101 471 340 81 328 ...
##  $ DateOfPolicyCoverage      : Date, format: "1998-10-25" "2000-11-15" ...
##  $ InsurancePolicyState      : chr  "State1" "State1" "State3" "State2" ...
##  $ Policy_CombinedSingleLimit: chr  "100/300" "100/300" "500/1000" "500/1000" ...
##  $ Policy_Deductible         : int  1000 1000 617 722 500 500 512 877 2000 1000 ...
##  $ PolicyAnnualPremium       : num  1633 1255 1373 1338 1354 ...
##  $ UmbrellaLimit             : int  0 0 0 0 4279863 3921366 165819 5282219 0 0 ...
##  $ InsuredRelationship       : chr  "not-in-family" "not-in-family" "wife" "own-child" ...
##  $ InsuredAge                : int  35 36 33 36 29 28 57 49 27 48 ...
##  $ InsuredZipCode            : Factor w/ 71 levels "430","431","432",..: 25 25 54 45 28 28 47 47 3 37 ...
##  $ InsuredGender             : Factor w/ 3 levels "1","2","3": 3 3 1 3 2 2 3 3 2 1 ...
##  $ InsuredEducationLevel     : chr  "JD" "JD" "JD" "JD" ...
##  $ InsuredOccupation         : chr  "armed-forces" "tech-support" "armed-forces" "armed-forces" ...
##  $ InsuredHobbies            : chr  "movies" "cross-fit" "polo" "polo" ...
##  $ CapitalGains              : int  56700 70600 66400 47900 0 0 67400 67400 56400 53300 ...
##  $ CapitalLoss               : int  -48500 -48500 -63700 -73400 -41500 -41500 0 0 -32800 0 ...
##  $ DateOfIncident            : Date, format: "2015-02-03" "2015-02-02" ...
##  $ TypeOfIncident            : chr  "Multi-vehicle Collision" "Multi-vehicle Collision" "Single Vehicle Collision" "Single Vehicle Collision" ...
##  $ TypeOfCollission          : Factor w/ 3 levels "Front Collision",..: 3 3 3 3 2 2 1 1 1 3 ...
##  $ SeverityOfIncident        : chr  "Total Loss" "Total Loss" "Minor Damage" "Minor Damage" ...
##  $ AuthoritiesContacted      : chr  "Police" "Police" "Other" "Other" ...
##  $ IncidentState             : chr  "State7" "State7" "State8" "State9" ...
##  $ IncidentCity              : chr  "City1" "City5" "City6" "City6" ...
##  $ IncidentAddress           : chr  "Location 1311" "Location 1311" "Location 2081" "Location 2081" ...
##  $ IncidentTime              : Factor w/ 24 levels "0","1","2","3",..: 18 11 23 23 11 8 21 19 4 6 ...
##  $ NumberOfVehicles          : int  3 3 1 1 1 1 1 1 3 1 ...
##  $ PropertyDamage            : Factor w/ 2 levels "NO","YES": 1 2 2 2 1 1 1 1 2 2 ...
##  $ BodilyInjuries            : int  1 2 2 2 2 1 0 0 0 1 ...
##  $ Witnesses                 : Factor w/ 4 levels "0","1","2","3": 1 2 4 4 2 3 3 3 1 3 ...
##  $ PoliceReport              : Factor w/ 2 levels "NO","YES": 1 2 1 1 2 1 1 1 1 2 ...
##  $ AmountOfTotalClaim        : chr  "65501" "61382" "66755" "66243" ...
##  $ AmountOfInjuryClaim       : int  13417 15560 11630 12003 8829 7818 6476 5738 6788 6510 ...
##  $ AmountOfPropertyClaim     : int  6071 5919 11630 12003 7234 8132 12822 7333 7504 13020 ...
##  $ AmountOfVehicleDamage     : int  46013 39903 43495 42237 37481 37217 58155 47498 53584 52080 ...
##  $ incident_coverage         : int  5945 5192 5085 3570 6650 5585 7286 7994 6117 100 ...
##  $ daysremainingforinsurance : num  6830 7948 6960 9570 3935 ...

#Distribution of y variable

table(final$ReportedFraud)
## 
##     N     Y 
## 21051  7785

###Checking for correlation between the numeric variables

nums <- unlist(lapply(final, is.numeric))  
numeric_data = final[,nums]
cor = round(cor(numeric_data),1)
cor[upper.tri(cor)] = " "
cat("Correlation Plot\n")
## Correlation Plot
cor
##                           CustomerLoyaltyPeriod Policy_Deductible
## CustomerLoyaltyPeriod     "1"                   " "              
## Policy_Deductible         "0.1"                 "1"              
## PolicyAnnualPremium       "0"                   "0"              
## UmbrellaLimit             "0"                   "0"              
## InsuredAge                "0.9"                 "0.1"            
## CapitalGains              "0"                   "0"              
## CapitalLoss               "0"                   "0"              
## NumberOfVehicles          "0"                   "0"              
## BodilyInjuries            "0"                   "0"              
## AmountOfInjuryClaim       "0.1"                 "0"              
## AmountOfPropertyClaim     "0.1"                 "0.1"            
## AmountOfVehicleDamage     "0.1"                 "0"              
## incident_coverage         "0.1"                 "0.1"            
## daysremainingforinsurance "0.7"                 "0"              
##                           PolicyAnnualPremium UmbrellaLimit InsuredAge
## CustomerLoyaltyPeriod     " "                 " "           " "       
## Policy_Deductible         " "                 " "           " "       
## PolicyAnnualPremium       "1"                 " "           " "       
## UmbrellaLimit             "0"                 "1"           " "       
## InsuredAge                "0"                 "0"           "1"       
## CapitalGains              "0"                 "0"           "0"       
## CapitalLoss               "0"                 "0"           "0"       
## NumberOfVehicles          "-0.1"              "0"           "0"       
## BodilyInjuries            "0"                 "0"           "0"       
## AmountOfInjuryClaim       "0"                 "0"           "0.1"     
## AmountOfPropertyClaim     "0"                 "0"           "0.1"     
## AmountOfVehicleDamage     "0"                 "0"           "0.1"     
## incident_coverage         "0"                 "0"           "0.1"     
## daysremainingforinsurance "0"                 "0"           "0.8"     
##                           CapitalGains CapitalLoss NumberOfVehicles
## CustomerLoyaltyPeriod     " "          " "         " "             
## Policy_Deductible         " "          " "         " "             
## PolicyAnnualPremium       " "          " "         " "             
## UmbrellaLimit             " "          " "         " "             
## InsuredAge                " "          " "         " "             
## CapitalGains              "1"          " "         " "             
## CapitalLoss               "-0.1"       "1"         " "             
## NumberOfVehicles          "0.1"        "0"         "1"             
## BodilyInjuries            "0.1"        "0"         "0"             
## AmountOfInjuryClaim       "0"          "0"         "0.3"           
## AmountOfPropertyClaim     "0"          "0"         "0.2"           
## AmountOfVehicleDamage     "0"          "0"         "0.3"           
## incident_coverage         "0"          "0"         "0"             
## daysremainingforinsurance "0"          "0"         "0"             
##                           BodilyInjuries AmountOfInjuryClaim
## CustomerLoyaltyPeriod     " "            " "                
## Policy_Deductible         " "            " "                
## PolicyAnnualPremium       " "            " "                
## UmbrellaLimit             " "            " "                
## InsuredAge                " "            " "                
## CapitalGains              " "            " "                
## CapitalLoss               " "            " "                
## NumberOfVehicles          " "            " "                
## BodilyInjuries            "1"            " "                
## AmountOfInjuryClaim       "0"            "1"                
## AmountOfPropertyClaim     "0"            "0.6"              
## AmountOfVehicleDamage     "0"            "0.8"              
## incident_coverage         "0"            "0"                
## daysremainingforinsurance "0"            "0.1"              
##                           AmountOfPropertyClaim AmountOfVehicleDamage
## CustomerLoyaltyPeriod     " "                   " "                  
## Policy_Deductible         " "                   " "                  
## PolicyAnnualPremium       " "                   " "                  
## UmbrellaLimit             " "                   " "                  
## InsuredAge                " "                   " "                  
## CapitalGains              " "                   " "                  
## CapitalLoss               " "                   " "                  
## NumberOfVehicles          " "                   " "                  
## BodilyInjuries            " "                   " "                  
## AmountOfInjuryClaim       " "                   " "                  
## AmountOfPropertyClaim     "1"                   " "                  
## AmountOfVehicleDamage     "0.8"                 "1"                  
## incident_coverage         "0"                   "0"                  
## daysremainingforinsurance "0.1"                 "0.1"                
##                           incident_coverage daysremainingforinsurance
## CustomerLoyaltyPeriod     " "               " "                      
## Policy_Deductible         " "               " "                      
## PolicyAnnualPremium       " "               " "                      
## UmbrellaLimit             " "               " "                      
## InsuredAge                " "               " "                      
## CapitalGains              " "               " "                      
## CapitalLoss               " "               " "                      
## NumberOfVehicles          " "               " "                      
## BodilyInjuries            " "               " "                      
## AmountOfInjuryClaim       " "               " "                      
## AmountOfPropertyClaim     " "               " "                      
## AmountOfVehicleDamage     " "               " "                      
## incident_coverage         "1"               " "                      
## daysremainingforinsurance "-0.6"            "1"

###Visualization

Density plot for numeric variables

###Visualizing all the variables V/s the target variable

for(i in 1:14){
  par(mfrow=c(2,2))
  print(ggplot(cbind(numeric_data,final$ReportedFraud), aes_string(x = colnames(numeric_data)[i], fill = "final$ReportedFraud")) +
          geom_density(alpha = 0.5) + 
          labs(title = colnames(numeric_data)[i], 
               x = paste("x",i,sep=""), 
               y = "Density", 
               col = "ReportedFraud"))
}

Binned variables V/s target

library(scorecard)
data = final[,setdiff(names(final),c("InsuredZipCode","IncidentAddress","AmountOfTotalClaim"))]
woe_bins = scorecard::woebin(data, y="ReportedFraud", method="tree",positive = "Y",bin_num_limit = 4)
## [INFO] creating woe binning ...
## Warning in check_datetime_cols(dt): There were 2 date/time columns removed from input dataset,
## DateOfPolicyCoverage, DateOfIncident
data_binned = scorecard::woebin_ply(data,woe_bins)  
## [INFO] converting into woe values ...
plots_demog=scorecard::woebin_plot(woe_bins, x = NULL, title = NULL, show_iv = TRUE)
par(mfrow=c(2,2))
plots_demog
## $VehicleAttributeDetails3

## 
## $VehicleAttributeDetails2

## 
## $VehicleAttributeDetails4

## 
## $CustomerLoyaltyPeriod

## 
## $InsurancePolicyState

## 
## $Policy_CombinedSingleLimit

## 
## $Policy_Deductible

## 
## $PolicyAnnualPremium

## 
## $UmbrellaLimit

## 
## $InsuredRelationship

## 
## $InsuredAge

## 
## $InsuredGender

## 
## $InsuredEducationLevel

## 
## $InsuredOccupation

## 
## $InsuredHobbies

## 
## $CapitalGains

## 
## $CapitalLoss

## 
## $TypeOfIncident

## 
## $TypeOfCollission

## 
## $SeverityOfIncident

## 
## $AuthoritiesContacted

## 
## $IncidentState

## 
## $IncidentCity

## 
## $IncidentTime

## 
## $NumberOfVehicles

## 
## $PropertyDamage

## 
## $BodilyInjuries

## 
## $Witnesses

## 
## $PoliceReport

## 
## $AmountOfInjuryClaim

## 
## $AmountOfPropertyClaim

## 
## $AmountOfVehicleDamage

## 
## $incident_coverage

## 
## $daysremainingforinsurance

Train Test split

set.seed(1234)
final$ReportedFraud = as.factor(final$ReportedFraud)
Train_ID=caret::createDataPartition(final$ReportedFraud,p=0.8,list = F)
Train_data = final[Train_ID,]
Test_data = final[-Train_ID,]
Train_data_binned = data[Train_ID,]
Test_data_binned = data[-Train_ID,]
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following object is masked from 'package:arules':
## 
##     %in%
## The following objects are masked from 'package:data.table':
## 
##     hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         17 minutes 28 seconds 
##     H2O cluster timezone:       America/Chicago 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.32.0.1 
##     H2O cluster version age:    5 months and 6 days !!! 
##     H2O cluster name:           H2O_started_from_R_siddarthbalasubramani_yav906 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.92 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.4 (2021-02-15)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is too old (5 months and 6 days)!
## Please download and install the latest version from http://h2o.ai/download/
finaldata.h2o=as.h2o(Train_data)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
finaltest.h2o=as.h2o(Test_data)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
y.dep<-1
x.indep<-2:40
xgboost.model = h2o.xgboost(x = x.indep,
                           y = y.dep,
                           training_frame = finaldata.h2o,
                           validation_frame = finaltest.h2o,
                           booster = "gbtree",
                           seed = 1234,
                           nfolds = 5,
                           distribution="bernoulli",
                           eta = 0.151,
                           max_depth = 10,
                           sample_rate = 1,
                           col_sample_rate=1)
## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [InsuredHobbies, IncidentState, InsurancePolicyState, SeverityOfIncident, InsuredRelationship, Policy_CombinedSingleLimit, InsuredEducationLevel, IncidentAddress, IncidentCity, TypeOfIncident, AuthoritiesContacted, AmountOfTotalClaim, InsuredOccupation].
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |=================================================                     |  71%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |=============================================================         |  88%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |======================================================================| 100%
h2o.auc(h2o.performance(xgboost.model))
## [1] 0.978399
perf <- h2o.performance(xgboost.model, finaltest.h2o)
perf
## H2OBinomialMetrics: xgboost
## 
## MSE:  0.08766438
## RMSE:  0.2960817
## LogLoss:  0.3164162
## Mean Per-Class Error:  0.1230924
## AUC:  0.9042465
## AUCPR:  0.8592338
## Gini:  0.8084931
## R^2:  0.5552127
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           N    Y    Error       Rate
## N      4028  182 0.043230  =182/4210
## Y       316 1241 0.202954  =316/1557
## Totals 4344 1423 0.086353  =498/5767
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.346662    0.832886 191
## 2                       max f2  0.283030    0.820686 215
## 3                 max f0point5  0.439468    0.876628 163
## 4                 max accuracy  0.397878    0.914514 175
## 5                max precision  0.980722    1.000000   0
## 6                   max recall  0.018563    1.000000 397
## 7              max specificity  0.980722    1.000000   0
## 8             max absolute_mcc  0.374774    0.777417 182
## 9   max min_per_class_accuracy  0.224053    0.849711 242
## 10 max mean_per_class_accuracy  0.311276    0.878571 204
## 11                     max tns  0.980722 4210.000000   0
## 12                     max fns  0.980722 1555.000000   0
## 13                     max fps  0.014146 4210.000000 399
## 14                     max tps  0.018563 1557.000000 397
## 15                     max tnr  0.980722    1.000000   0
## 16                     max fnr  0.980722    0.998715   0
## 17                     max fpr  0.014146    1.000000 399
## 18                     max tpr  0.018563    1.000000 397
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
h2o.varimp(xgboost.model)
## Variable Importances: 
##                variable relative_importance scaled_importance percentage
## 1 AmountOfVehicleDamage         5327.692383          1.000000   0.093605
## 2   AmountOfInjuryClaim         3475.890625          0.652420   0.061070
## 3 AmountOfPropertyClaim         3307.105469          0.620739   0.058104
## 4        DateOfIncident         3211.359863          0.602768   0.056422
## 5          CapitalGains         2909.274658          0.546067   0.051115
## 
## ---
##                            variable relative_importance scaled_importance
## 189              InsuredZipCode.453            5.791574          0.001087
## 190                 IncidentTime.10            4.459343          0.000837
## 191 VehicleAttributeDetails3.Accord            3.391335          0.000637
## 192              InsuredZipCode.447            0.374069          0.000070
## 193              InsuredZipCode.610            0.238735          0.000045
## 194              InsuredZipCode.430            0.054365          0.000010
##     percentage
## 189   0.000102
## 190   0.000078
## 191   0.000060
## 192   0.000007
## 193   0.000004
## 194   0.000001
glm.model <- h2o.glm(y=y.dep, x=x.indep, training_frame = finaldata.h2o, family = "binomial",nfolds=5)
## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [InsuredHobbies, IncidentState, InsurancePolicyState, SeverityOfIncident, InsuredRelationship, Policy_CombinedSingleLimit, InsuredEducationLevel, IncidentAddress, IncidentCity, TypeOfIncident, AuthoritiesContacted, AmountOfTotalClaim, InsuredOccupation].
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
h2o.auc(h2o.performance(glm.model))
## [1] 0.7561092
#predict.dl2 <- as.data.frame(h2o.predict(gbm.model, finaltest.h2o))
perf <- h2o.performance(glm.model, finaltest.h2o)
perf
## H2OBinomialMetrics: glm
## 
## MSE:  0.1680515
## RMSE:  0.4099409
## LogLoss:  0.5101557
## Mean Per-Class Error:  0.3186181
## AUC:  0.7408383
## AUCPR:  0.5101469
## Gini:  0.4816767
## R^2:  0.1473484
## Residual Deviance:  5884.136
## AIC:  6246.136
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##           N    Y    Error        Rate
## N      3228  982 0.233254   =982/4210
## Y       629  928 0.403982   =629/1557
## Totals 3857 1910 0.279348  =1611/5767
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.328587    0.535333 185
## 2                       max f2  0.150035    0.679771 306
## 3                 max f0point5  0.368664    0.520339 163
## 4                 max accuracy  0.459682    0.754118 117
## 5                max precision  0.898059    1.000000   0
## 6                   max recall  0.017053    1.000000 398
## 7              max specificity  0.898059    1.000000   0
## 8             max absolute_mcc  0.360930    0.345253 167
## 9   max min_per_class_accuracy  0.275199    0.677197 218
## 10 max mean_per_class_accuracy  0.328587    0.681382 185
## 11                     max tns  0.898059 4210.000000   0
## 12                     max fns  0.898059 1556.000000   0
## 13                     max fps  0.012936 4210.000000 399
## 14                     max tps  0.017053 1557.000000 398
## 15                     max tnr  0.898059    1.000000   0
## 16                     max fnr  0.898059    0.999358   0
## 17                     max fpr  0.012936    1.000000 399
## 18                     max tpr  0.017053    1.000000 398
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`